set.seed(1)

## Packages required by the analysis.
## (Duplicates in the original list -- "scales", "here", "glue", "janitor" --
## have been removed; attaching a package twice was a harmless no-op.)
required_packages <- c(
  "tidyverse", "magrittr", "DBI", "bigrquery", "arrow", "glue", "vroom",
  "janitor", "gt", "ggwordcloud", "readxl", "ggthemes", "hrbrthemes",
  "extrafont", "plotly", "scales", "stringr", "gganimate", "here",
  "tidytext", "sentimentr", "DT", "sm", "mblm", "fs", "knitr",
  "rmdformats", "urltools", "colorspace", "pdftools", "showtext",
  "pander", "wordcloud2", "stopwords"
)

## Install any missing package, then attach it.
## requireNamespace() only checks availability; library() errors loudly if
## the install failed, instead of require() silently returning FALSE.
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
}

## Left-align all pander tables by default.
panderOptions("table.alignment.default", "left")

## Resolution (dots per inch) used when exporting PNGs.
dpi <- 750
## Theme updates; please adjust to the client's website branding.
## Base theme: ggthemes "clean" look, sized up and set in Montserrat.
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))
## Global tweaks layered on top of the base theme for every plot in the report.
theme_update(plot.margin = margin(30, 30, 30, 30),
             plot.background = element_rect(color = "white",
                                            fill = "white"),
             ## centered, bold title aligned to the full plot (not the panel)
             plot.title = element_text(size = 20,
                                       face = "bold",
                                       lineheight = 1.05,
                                       hjust = .5,
                                       margin = margin(10, 0, 25, 0)),
             plot.title.position = "plot",
             ## caption pulled down into the bottom margin (negative bottom)
             plot.caption = element_text(color = "grey40",
                                         size = 9,
                                         margin = margin(20, 0, -20, 0)),
             plot.caption.position = "plot",
             ## solid black axis lines on both axes
             axis.line.x = element_line(color = "black",
                                        size = .8),
             axis.line.y = element_line(color = "black",
                                        size = .8),
             ## bold axis titles, pushed away from the axis text
             axis.title.x = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(t = 20)),
             axis.title.y = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(r = 20)),
             axis.text = element_text(size = 11,
                                      color = "black",
                                      face = "bold"),
             axis.text.x = element_text(margin = margin(t = 10)),
             axis.text.y = element_text(margin = margin(r = 10)),
             ## tick marks removed; the pale grid carries the scale instead
             axis.ticks = element_blank(),
             panel.grid.major.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.major.y = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.y = element_blank(),
             ## generous spacing between facet panels
             panel.spacing.x = unit(4, "lines"),
             panel.spacing.y = unit(2, "lines"),
             legend.position = "top",
             legend.title = element_text(family = "Montserrat",
                                         color = "black",
                                         size = 14,
                                         margin = margin(5, 0, 5, 0)),
             legend.text = element_text(family = "Montserrat",
                                        color = "black",
                                        size = 11,
                                        margin = margin(4.5, 4.5, 4.5, 4.5)),
             ## transparent legend chrome so it sits cleanly on the plot
             legend.background = element_rect(fill = NA,
                                              color = NA),
             legend.key = element_rect(color = NA, fill = NA),
             #legend.key.width = unit(5, "lines"),
             #legend.spacing.x = unit(.05, "pt"),
             #legend.spacing.y = unit(.55, "pt"),
             #legend.margin = margin(0, 0, 10, 0),
             ## bold facet strip labels with breathing room below
             strip.text = element_text(face = "bold",
                                       margin = margin(b = 10)))
## Theme add-on for flipped (horizontal) plots: swap which minor grid is shown
## so the grid still runs along the value axis after coord_flip().
theme_flip <-
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_line(size = .6,
                                          color = "#eaeaea"))
## Theme add-on for charts without a y axis: drop the grid, y line and y labels.
theme_blank <-
  theme(panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_blank(),
        axis.line.y = element_blank(),
        axis.text.y = element_blank())

## Numeric label formatter: comma separators, no scientific notation.
## (TRUE/FALSE spelled out -- the T/F shortcuts are reassignable and unsafe.)
num_format <- scales::format_format(big.mark = ",", small.mark = ",",
                                    scientific = FALSE)
## Main Backlinko brand color, plus a darkened variant for accents.
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")

!!!J: In this version I remove all the entries with zero volume for all stats. I think if we prefer to go by count rather than by volume, this is the best approach.

!!!D: Sounds good. I trust you on this. Please make sure to add a note to the methodology section when moving the files.

Basic stats

Total number of searches
~306 million
Total volume of searches
~303 billion


This table shows the top 10 searches. They are all spelling errors. As in, they are not really searched, but rather people attempting to go to Youtube or Facebook, but typing it wrong. Oddly they are all attributed as having a search volume of exactly 185 million.

Top searches
keyword location spell spell_type keyword_info_search_volume
jou tube 2840 youtube showing_results_for 185000000
youtube the 2840 185000000
youi tue 2840 youtube showing_results_for 185000000
acerook 2840 185000000
youetube 2840 185000000
you tbut 2840 youtube did_you_mean 185000000
ykutube 2840 youtube showing_results_for 185000000
uotod 2840 youtube showing_results_for 185000000
utuen 2840 youtube did_you_mean 185000000
ytu tube 2840 185000000
Missing search volume
0.975%

The missing have some searches that are likely high volume. Thus they are truly missing, and not just 0s.

Keywords with missing search volume
keyword keyword_info_search_volume
Amtrak train schedule NA
Btnh days of our lives Lyrics NA
Oak to den google flights NA
Easter Bunny tracks NA
Downtown Cocktail Room NA
CPU CCD temperature NA
Deer hunting swamp islands NA
Paul Rodgers : acoustic NA
Movies that will change your life Reddit NA
Bear Paw Print Tattoo NA
Mean search volume
988.6631
Median search volume
10
Mean CPC
0.6069549
Median CPC
0


Spell types


About half of search volume has a spell type. This is especially driven by misspellings of common domains.

If going by count instead of by volume, almost none of the searches have a spell type:

Proportion with spell type
1.389%

!!!D: the chart appears to be missing a "No spell type" category for the searches without a spell type.

!!!J: This is on purpose, since only 1.38% has a spell type, as shown just above.


Top 10 intended searches that are misspelled
spell volume
youtube 35.3%
facebook 8.7%
amazon 7.6%
google 6.3%
weather 2.2%
translate 1.6%
com 1.5%
instagram 1.3%
walmart 1.3%
ebay 1.2%



Questions

Total percentage of searches that are questions
14.085%


Stopwords

!!!D: the graph is not that insightful. Please make the wordcloud a bit more readable and visually appealing. Feel free to add more than 25 words.


Search tails



Keyword_info categories

!!!D: I would be curious to see the same graph with the median. Wouldn't it be better to use the median, given the skewed data set?

!!!J: I am skeptical for using the median, since it is brought far down by a large number of searches with low volume, even when we exclude 0 volume searches. Remember, the median cpc overall was 0. But you are right that it is an issue with the skewed data set.

!!!D: Maybe a boxplot or something similar would make more sense here? Or something like this: https://tinyurl.com/yxfwo5vt Leave it out if it's too complicated to implement.





Keyword type

## Signal words/phrases used to classify keywords by search intent.

## Intent: the searcher wants to buy or complete a transaction.
transactional_words <- c(
  "apply", "buy", "coupons", "clearance", "deals", "discount", "download",
  "for sale", "order", "purchase", "reserve", "schedule appointment",
  "special"
)

## Intent: the searcher wants to learn or compare something.
informational_words <- c(
  "how do", "how does", "how can i", "what is", "what are", "ways to",
  "guide", "how to", "tutorial", "best", "cheap", "alternatives",
  "compare", "improve"
)

## Intent: the searcher wants to reach a specific place, page, or detail.
navigational_words <- c(
  "location of", "near me", "features of", "cost of", "hours of",
  "directions to", "reviews", "free shipping", "prices", "testimonials"
)

get_keyword_counts <- function(type, wordlist){
  ## Count US (location 2840) keywords with positive search volume that
  ## contain each signal word, and write the per-word counts to a CSV.
  ##
  ## type:     label for this word list, e.g. "transactional"
  ## wordlist: character vector of signal words/phrases
  ##
  ## Side effect: writes ../proc_data/{type}_words.csv
  ## Returns the counts tibble invisibly so it can be inspected if wanted.
  counts <- vector("list", length(wordlist))  # preallocate; no row-growing
  for (i in seq_along(wordlist)) {
    word <- wordlist[[i]]
    message("Counting: ", word)
    ## NOTE: `word` is interpolated straight into the SQL via glue(); this is
    ## acceptable only because the word lists above are hard-coded -- never
    ## call this with user-supplied input.
    ## BUGFIX: the OR group must be parenthesized. Without parentheses,
    ## SQL's AND-over-OR precedence made the prefix/suffix LIKE branches
    ## bypass the location and search-volume filters entirely.
    sql <- glue(
      "SELECT COUNT(keyword_info_search_volume) AS count
       FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
       WHERE location = 2840
       AND keyword_info_search_volume > 0
       AND (keyword LIKE '% {word} %'
            OR keyword LIKE '{word} %'
            OR keyword LIKE '% {word}')")
    tb <- bq_project_query("dataforseo-bigquery", sql)
    df <- bq_table_download(tb)
    counts[[i]] <- tibble(keyword = word, n = df$count)
  }
  words <- bind_rows(counts)
  write_csv(words %>% mutate(type = type),
            glue("../proc_data/{type}_words.csv"))
  invisible(words)
}


## One-off BigQuery runs; results are cached as CSVs, so these stay commented.
#get_keyword_counts("transactional", transactional_words)
#get_keyword_counts("informational", informational_words)
#get_keyword_counts("navigational", navigational_words)

## Combine the cached per-intent keyword counts.
df <- bind_rows(
  read_csv("../proc_data/transactional_words.csv"),
  read_csv("../proc_data/informational_words.csv"),
  read_csv("../proc_data/navigational_words.csv")
)

## Share of all keywords matching each intent category.
## NOTE(review): `total_count` is not defined anywhere in this file -- it must
## come from an earlier chunk; confirm it holds the overall keyword count.
df %>%
  group_by(type) %>%
  summarise(prop = sum(n) / total_count) %>%
  mutate(type = str_to_title(type)) %>%
  ggplot(aes(x = type, y = prop)) +
  ## geom_col() is the idiomatic shorthand for geom_bar(stat = "identity")
  geom_col(width = 0.8, fill = "turquoise4", color = "black") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.13),
                     expand = c(0, 0)) +
  ## BUGFIX: the original passed "Keyword type" positionally; labs() only
  ## maps *named* arguments to labels, so the title was silently dropped.
  labs(title = "Keyword type", y = NULL, x = NULL)

!!!D: I would guess most are navigational queries. May be we can add a separate category “navigational - brands”? see a list below. What do you think?

Also, please note what the % is for keywords that don't fit into any of the categories, and list the keywords in your descriptions for all categories.

navigational_words_brands <- c( Amazon Google Apple Microsoft Samsung Group ICBC Facebook Walmart Ping An Huawei Mercedes-Benz Verizon China Construction Bank AT&T Toyota State Grid Disney Agricultural Bank Of China WeChat Bank of China Home Depot China Mobile Shell Saudi Aramco Volkswagen YouTube Tencent Starbucks Wells Fargo BMW Deutsche Telekom Moutai PetroChina Coca-Cola Mitsubishi Group McDonald’s Taobao NTT Group Bank of America Nike Porsche Sinopec IBM CITI Honda Marlboro Deloitte Chase Tmall UPS American Express Xfinity UnitedHealthcare Sumitomo Group Intel VISA Instagram China Life accenture Allianz CSCEC PWC Lowe’s Mitsui General Electric EY Oracle Cisco BP CVS Total FedEx Netflix China Merchants Bank JP Morgan Boeing Costco SK Group Wuliangye Evergrande Nestle Hyundai Group China Telecom Siemens TATA Group Mastercard Bosch IKEA HSBC Spectrum Vodafone Pepsi Alibaba.com Ford AIA Orange Nissan Chevron GUCCI Dell Technologies


Session Info

## [1] "2020-11-03 14:19:02 CET"
## Local:    master C:/Users/DataVizard/Google Drive/Work/DataViz/Clients/FrontPage-2020-009/keyword-landscape-analysis
## Remote:   master @ origin (https://github.com/frontpagedata/keyword-landscape-analysis.git)
## Head:     [99c1187] 2020-11-02: update
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=German_Germany.1252  LC_CTYPE=German_Germany.1252   
## [3] LC_MONETARY=German_Germany.1252 LC_NUMERIC=C                   
## [5] LC_TIME=C                      
## system code page: 65001
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] stopwords_2.0      wordcloud2_0.2.1   pander_0.6.3       showtext_0.8-1    
##  [5] showtextdb_3.0     sysfonts_0.8.1     pdftools_2.3.1     colorspace_1.4-1  
##  [9] urltools_1.7.3     rmdformats_0.3.7   knitr_1.29         fs_1.5.0          
## [13] mblm_0.12.1        sm_2.2-5.6         DT_0.15            sentimentr_2.7.1  
## [17] tidytext_0.2.5     here_0.1           gganimate_1.0.6    scales_1.1.1      
## [21] plotly_4.9.2.1     extrafont_0.17     hrbrthemes_0.8.6   ggthemes_4.2.0    
## [25] readxl_1.3.1       ggwordcloud_0.5.0  gt_0.2.2           janitor_2.0.1     
## [29] vroom_1.2.1        glue_1.4.1         arrow_2.0.0        bigrquery_1.3.2   
## [33] DBI_1.1.0          magrittr_1.5       forcats_0.5.0      stringr_1.4.0     
## [37] dplyr_1.0.1        purrr_0.3.4        readr_1.3.1        tidyr_1.1.1       
## [41] tibble_3.0.3       ggplot2_3.3.2.9000 tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##  [1] ellipsis_0.3.1    qdapRegex_0.7.2   rprojroot_1.3-2   snakecase_0.11.0 
##  [5] rstudioapi_0.11   farver_2.0.3      scico_1.2.0.9000  SnowballC_0.7.0  
##  [9] bit64_4.0.2       fansi_0.4.1       lubridate_1.7.9   textclean_0.9.3  
## [13] xml2_1.3.2        jsonlite_1.7.0    broom_0.7.0       Rttf2pt1_1.3.8   
## [17] dbplyr_1.4.4      png_0.1-7         compiler_4.0.2    httr_1.4.2       
## [21] backports_1.1.7   assertthat_0.2.1  Matrix_1.2-18     lazyeval_0.2.2   
## [25] gargle_0.5.0      cli_2.0.2         tweenr_1.0.1      htmltools_0.5.0  
## [29] prettyunits_1.1.1 tools_4.0.2       gtable_0.3.0      Rcpp_1.0.5       
## [33] cellranger_1.1.0  lexicon_1.2.1     vctrs_0.3.2       extrafontdb_1.0  
## [37] xfun_0.16         rvest_0.3.6       lifecycle_0.2.0   hms_0.5.3        
## [41] curl_4.3          yaml_2.2.1        sass_0.2.0        gdtools_0.2.2    
## [45] triebeard_0.3.0   stringi_1.4.6     tokenizers_0.2.1  checkmate_2.0.0  
## [49] gifski_0.8.6      rlang_0.4.7       pkgconfig_2.0.3   systemfonts_0.2.3
## [53] evaluate_0.14     lattice_0.20-41   labeling_0.3      htmlwidgets_1.5.1
## [57] bit_4.0.4         tidyselect_1.1.0  bookdown_0.20     R6_2.4.1         
## [61] generics_0.0.2    pillar_1.4.6      haven_2.3.1       withr_2.2.0      
## [65] janeaustenr_0.1.5 modelr_0.1.8      crayon_1.3.4      rmarkdown_2.3    
## [69] usethis_1.6.1     progress_1.2.2    syuzhet_1.0.4     grid_4.0.2       
## [73] data.table_1.13.0 git2r_0.27.1      qpdf_1.1          blob_1.2.1       
## [77] reprex_0.3.0      digest_0.6.25     openssl_1.4.2     munsell_0.5.0    
## [81] viridisLite_0.3.0 askpass_1.1       tcltk_4.0.2